/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection

Created on: 12/28/2022
Last Modified on: 3/28/2023

Description: This program generates an investigator-level dataset that contains
information on investigators' characteristics such as their race and gender, 
urbanicity of the county where they work, etc. The program takes as an input the
dataset "final_prediction_race_gender." This dataset was created by researchers 
at CADL with the help of our RAs, and predicts the gender and race of each investigator 
based on their first and last names. We describe the procedure for the prediction 
in the paper.

Note that we have removed the file directory names from this program for 
confidentiality reasons. 
********************************************************************************/

**************************
**(0) SETUP
**************************
* Start from a clean session: no data in memory, no output paging,
* no leftover global macros, and no log file left open by an earlier run.
clear
set more off
* _all is the keyword that clears every global macro. A bare "all" only targets
* a macro literally named all, so stale globals from earlier work would survive.
* NOTE(review): if this file is called from a master do-file that defines its
* own globals, they are cleared here as well - confirm that is acceptable.
macro drop _all
capture log close

*Set directories
* Paths are intentionally blank (removed for confidentiality). Fill them in
* before running; file names are appended directly to these globals below, so
* each path must end with a directory separator.
global clean 
global cleand 
global tmpd 
global rawd 

**************************
**(1) GENERATE LIMITED ANALYSIS SAMPLE 
**************************
* Load the investigation sample (investigations from 2008 to 2017)
use "${clean}child_investigation_sample.dta", clear

* The file is laid out as a panel; flag one row per child X investigation pair
* and discard every other row
gegen tag = tag(vicid inv_caseid)
drop if tag != 1

** Main sample restrictions
* Exclude sexual abuse cases
keep if sexab != 1

* Exclude observations whose zipcode is missing
keep if zipcode_vic != .

* Restrict to white and black children
drop if white != 1 & black != 1

* Restrict to investigators with 200 or more cases
* (the count is taken at this point, before the restrictions that follow)
by worker_id, sort: generate n = _N
keep if n >= 200

* Build rotation groups (zipcode X cps_year) and remove the trivial ones,
* i.e. groups made up of a single observation
egen rotationgroup = group(zipcode_vic cps_year)
by rotationgroup, sort: generate nobs = _N
tabulate nobs if nobs < 10
keep if nobs != 1

* Remove investigators whose caseload shows no variation in child race:
* a within-investigator variance of exactly zero means the indicator is
* constant across all of that investigator's cases
gegen tmp = var(white), by(worker_id)
order tmp
keep if tmp != 0
drop tmp

* Same check on the black indicator (tmp from this pass stays in the dataset)
gegen tmp = var(black), by(worker_id)
order tmp
keep if tmp != 0

* Require a six-month follow-up window: drop any observation for which one of
* postm1_inv through postm6_inv is missing
drop if postm1_inv == . | postm2_inv == . | postm3_inv == . ///
	| postm4_inv == . | postm5_inv == . | postm6_inv == .
rename black d_black

*Generate main outcomes
* For each horizon H = 1, ..., 12, invHm starts at 0 and is switched to 1 as
* soon as any of postm1_inv through postmH_inv equals 1. The summarize calls
* inside the loop are progress diagnostics only.
forvalues h = 1/12 {
	generate inv`h'm = 0
	forvalues m = 1(1)`h' {
		summarize postm`m'_inv
		replace inv`h'm = 1 if postm`m'_inv == 1
		summarize inv`h'm*
	}
}

* For horizons 1 through 6 the outcome is set to missing whenever fc equals 1
* (horizons 7 through 12 are left untouched)
forvalues h = 1/6 {
	replace inv`h'm = . if fc == 1
}

summarize inv*m

*Generate remaining variables
* nofc is 1 where fc equals 0 and 0 everywhere else (including where fc is missing)
generate nofc = (fc == 0)

// Number of cases handled by each investigator
capture drop count_inv
bysort worker_id: generate long count_inv = _N

// Per-investigator case counts by child race, plus the share of black children
*rename pre_black d_black
bysort worker_id: egen count_black = total(d_black)
generate nonblack = (d_black == 0)
bysort worker_id: egen count_white = total(nonblack)
bysort worker_id: egen share_black = mean(d_black)

// Sample-wide mean of d_black, stored as a constant column
summarize d_black
local bshare = r(mean)
generate bshare = `bshare'

// Replace the incoming pre_black with the indicator built from black
drop pre_black
rename d_black pre_black

// Write out the analysis sample
save "${cleand}child_investigation_analysis_sample.dta", replace


**************************
**(2) GENERATE KEY INVESTIGATOR-LEVEL COVARIATES
**************************
use "${cleand}child_investigation_analysis_sample.dta", clear

*Generate remaining variables
gsort worker_id complaint_date 
order worker_id complaint_date 

//Count of cases
cap drop count_inv
bys worker_id: gen long count_inv = _N

//Share of total cases for black children
// n  = number of cases handled by the investigator
// nb = number of cases in the investigator X pre_black cell
cap drop n 
bysort worker_id: gen n = _N 
bysort worker_id pre_black: gen nb = _N 
order n nb pre_black 

// Dropped one pattern at a time: a single drop listing both wildcards fails as
// a whole, and removes nothing, when either pattern matches no variable
cap drop nblack*
cap drop nwhite*

// Spread each race-specific cell count to every row of the investigator.
// An investigator with no cases of a given race has no such cell, so the max
// over that cell comes back missing. The true count in that case is 0, and
// setting it keeps the shares below at 0 instead of missing.
local cell_nblack 1
local cell_nwhite 0
foreach x in nblack nwhite {
	gen `x'_tmp = nb if pre_black==`cell_`x''
	gegen `x' = max(`x'_tmp), by(worker_id)
	replace `x' = 0 if missing(`x')
}

order nblack nwhite 

gen sh_black = nblack/n 
gen sh_white = nwhite/n

// One tagged row per investigator, used for investigator-level summaries
cap drop tag
egen tag = tag(worker_id)

sum sh_black sh_white count_inv if tag 

//Newly-hired employees during our sample 
// NOTE(review): year() assumes complaint_date is a Stata daily date - confirm
gen complaint_year = year(complaint_date)
tab complaint_year 

// First calendar year in which the investigator appears in the sample
egen first_year = min(complaint_year), by(worker_id)
order first_year 
// Stata orders missing values above every number, so an unguarded
// first_year > 2008 would be true for investigators with no usable complaint
// date and code them as newly hired. The flag is left missing for them instead.
gen newemp = first_year >2008 if !missing(first_year)
tab newemp if tag 

// Modal county of each investigator, as a name (mcounty) and as a numeric code (mcountyn)
gsort worker_county
egen countynum = group(worker_county)
gegen mcounty = mode(worker_county), by(worker_id)
gegen mcountyn = mode(countynum), by(worker_id)

// Collapse to one row per investigator and retain only investigator-level fields
gduplicates drop worker_id, force
keep worker_id sh_black count_inv newemp mcounty mcountyn

// urban = 1 when the modal county is one of the six counties listed, 0 otherwise
capture drop urban
generate urban = inlist(mcounty, "Ingham County", "Kent County", "Genesee County", ///
	"Macomb County", "Oakland County", "Wayne County")

//investigator race and gender 
// Stage the name-based race/gender predictions in a tempfile keyed on worker_id.
// preserve/restore puts the investigator-level data back in memory afterwards.
preserve 
use "${rawd}final_prediction_race_gender.dta", clear 
// force keeps one row per id and discards the others without checking that they agree
gduplicates drop id, force 
rename id worker_id 
tempfile tmp 
save `tmp', replace 
restore 

// keep(1 3): every investigator in memory is retained, matched or not;
// prediction rows without a matching investigator are discarded
merge 1:1 worker_id using `tmp', keep(1 3)
drop fname lname 
drop _merge 
// NOTE(review): fname was dropped two lines above, so this rename can only work
// if python and fname resolve (presumably through variable abbreviation) to
// other variables in the prediction file - confirm against that file
rename (python fname) (race gender)

tab race
tab gender 

// Investigator race dummies. A string comparison is 0 for an empty string, so
// investigators left unmatched by the merge get whiteinv = 0 and blackinv = 0
// and therefore otherinv = 1.
// NOTE(review): confirm that unmatched investigators are meant to count as other
gen whiteinv = race=="white"
gen blackinv = race=="black"
gen otherinv = whiteinv==0 & blackinv==0

// Likewise, femaleinv is 0 both for non-female predictions and for unmatched investigators
gen femaleinv = gender=="female"

// Keep only the investigator-level covariates and write them out
keep worker_id sh_black count_inv newemp urban whiteinv blackinv otherinv femaleinv

save "${tmpd}inv_covariates.dta", replace 
